"""
Import needed packages
"""
from RFS_settings import *
import RFS_Functions.Utilis as Util


"""
Load Needed Data
"""
# Load 1-Month Treasury Constant Maturity Rate (GS1M) to be later added to df_est
GS1M = pd.read_csv(os.path.join(data_path, 'GS1M.csv'))
GS1M['GS1M'] = GS1M['GS1M'] / 100
GS1M['GS1M'] = GS1M['GS1M'] + 1
GS1M.year = GS1M.year.astype(str)
GS1M.month = GS1M.month.astype(str)
GS1M['yearmonth'] = \
    pd.to_datetime(GS1M['year'].astype(str) + GS1M['month'].astype(str), format='%Y%m') + MonthEnd(0)
# Load and format separation rates from BLS data
BLS = pd.read_csv(os.path.join(data_path, 'bls_data.csv'))
BLS['year'] = BLS.Monthend.str[:4]
BLS['month'] = BLS.Monthend.str[-2:]
BLS['yearmonth'] = pd.to_datetime(BLS['year'].astype(str) + BLS['month'].astype(str), format='%Y%m') + MonthEnd(0)
BLS_vars = ['Totalseparations_Rate', 'Jobopenings_Rate', 'Hires_Rate']
for var in BLS_vars:
    BLS[var] = BLS[var] / 100
# Load job-postings data samples
df_dummies = pd.read_csv(os.path.join(data_path, "categorized_jobs_pseudo.csv"))


"""
Merge BLS rates to df_dummies
"""
df_dummies.Sector = pd.to_numeric(df_dummies.Sector)
df_dummies.JobDate = pd.to_datetime(df_dummies.JobDate)
df_dummies['year'] = df_dummies.JobDate.dt.year
df_dummies['month'] = df_dummies.JobDate.dt.month
df_dummies = df_dummies.drop(columns={'JobDate'})
df_dummies['yearmonth'] = pd.to_datetime(df_dummies['year'].astype(str) +
                                          df_dummies['month'].astype(str), format='%Y%m') + MonthEnd(0)
df_dummies = df_dummies.merge(BLS[['Sector', 'yearmonth'] + BLS_vars], on=['yearmonth', 'Sector'], how='left')
df_dummies = df_dummies.merge(BLS[['SectorName', 'yearmonth'] + BLS_vars], on=['yearmonth', 'SectorName'], how='left')
for var in BLS_vars:
    df_dummies[var] = df_dummies[var + '_x']
    df_dummies.loc[(df_dummies[var].isnull()) & (~df_dummies[var + '_y'].isnull()), var] = \
        df_dummies.loc[(df_dummies[var].isnull()) & (~df_dummies[var + '_y'].isnull()), var + '_y']
    df_dummies = df_dummies.drop([var + '_x', var + '_y'], axis=1)


"""
Generate Needed Variables
"""
print('---')
print('Starting analysis for sample:')
print(' - Sample size:', df_dummies.shape[0])
print(' - Unique employers:', df_dummies.Employer_ID.unique().shape[0])
# Remove observations for year 2007
df_dummies = df_dummies[df_dummies.year != 2007]
df_dummies['yearmonth'] = pd.to_datetime(df_dummies['year'].astype(str) +
                                     df_dummies['month'].astype(str), format='%Y%m') + MonthEnd(0)
# Generate percentage of employers graph
employers = df_dummies.groupby('year')['Employer_ID'].nunique().reset_index()
AI_empl = df_dummies[df_dummies.isAI > 0].drop_duplicates(['Employer_ID', 'year'])
empl_AI = AI_empl.groupby('year')['BGTJobId'].nunique().reset_index()
empl_AI = empl_AI.rename(columns={'BGTJobId': 'AI'})
employers = employers.merge(empl_AI, on='year')
OT_empl = df_dummies[df_dummies.isOldTech > 0].drop_duplicates(['Employer_ID', 'year'])
empl_OT = OT_empl.groupby('year')['BGTJobId'].nunique().reset_index()
empl_OT = empl_OT.rename(columns={'BGTJobId': 'OldTech'})
employers = employers.merge(empl_OT, on='year')
DM_empl = df_dummies[df_dummies.isDataMgmt > 0].drop_duplicates(['Employer_ID', 'year'])
empl_DM = DM_empl.groupby('year')['BGTJobId'].nunique().reset_index()
empl_DM = empl_DM.rename(columns={'BGTJobId': 'DataMgmt'})
employers = employers.merge(empl_DM, on='year')
employers.AI = (employers['AI'] / employers['Employer_ID']) * 100
employers.OldTech = (employers['OldTech'] / employers['Employer_ID']) * 100
employers.DataMgmt = (employers['DataMgmt'] / employers['Employer_ID']) * 100
all_keys = ['AI', 'OldTech', 'DataMgmt']
plt.figure(figsize=(8, 5))
i = 0
for key in all_keys:
    pos = list(range(len(employers['year'])))
    width = 1 / (len(all_keys) + 1)
    plt.bar([p + width * i for p in pos], employers[key], width, alpha=0.75,
            label=key + '_% of employers', color=plots_colors.get(key))
    plt.xticks([p + 0.5 * (len(all_keys) - 2) * width for p in pos], employers['year'], fontsize=18)
    i += 1
plt.ylabel('% of employers', fontsize=18)
plt.xlabel('Year', fontsize=18)
plt.legend(loc='upper center', bbox_to_anchor=(0.5, 1.3), fontsize=18)
plt.yticks(fontsize=18)
plt.savefig(os.path.join(plot_path, 'Figure_3a.jpg'), bbox_inches='tight')
plt.show()
plt.close()
BLS_vars = ['Totalseparations_Rate', 'Jobopenings_Rate', 'Hires_Rate']
BLS_AI = df_dummies.groupby(['isAI', 'yearmonth'])[BLS_vars].mean().reset_index()
BLS_AI = BLS_AI[BLS_AI.isAI == 1]
BLS_AI = BLS_AI.rename(columns={'Totalseparations_Rate': 's_AI', 'Jobopenings_Rate': 'o_AI',
                                'Hires_Rate': 'h_AI'})
BLS_AI['h_o_AI'] = (BLS_AI.h_AI * (1 - BLS_AI.o_AI)) / BLS_AI.o_AI
del BLS_AI['isAI']
BLS_OT = df_dummies.groupby(['isOldTech', 'yearmonth'])[BLS_vars].mean().reset_index()
BLS_OT = BLS_OT[BLS_OT.isOldTech == 1]
BLS_OT = BLS_OT.rename(columns={'Totalseparations_Rate': 's_OldTech', 'Jobopenings_Rate': 'o_OldTech',
                                'Hires_Rate': 'h_OldTech'})
BLS_OT['h_o_OldTech'] = (BLS_OT.h_OldTech * (1 - BLS_OT.o_OldTech)) / BLS_OT.o_OldTech
del BLS_OT['isOldTech']
BLS_DM = df_dummies.groupby(['isDataMgmt', 'yearmonth'])[BLS_vars].mean().reset_index()
BLS_DM = BLS_DM[BLS_DM.isDataMgmt == 1]
BLS_DM = BLS_DM.rename(columns={'Totalseparations_Rate': 's_DataMgmt', 'Jobopenings_Rate': 'o_DataMgmt',
                                'Hires_Rate': 'h_DataMgmt'})
BLS_DM['h_o_DataMgmt'] = (BLS_DM.h_DataMgmt * (1 - BLS_DM.o_DataMgmt)) / BLS_DM.o_DataMgmt
del BLS_DM['isDataMgmt']
BLS_all = pd.merge(BLS_AI, BLS_OT, on=['yearmonth'], how='outer')
BLS_all = BLS_all.merge(BLS_DM, on=['yearmonth'], how='outer')
BLS_all = BLS_all.sort_values('yearmonth')
BLS_all = BLS_all.ffill(axis=0)
# Get w_all from payscale data
fname = f"salaries_pseudo.csv"
w_all = pd.read_csv(os.path.join(data_path, fname))
w_all = Util.col2monthend(w_all, 'yearmonth')
w_all = w_all[~w_all.w_AI.isnull()]
w_all = w_all[w_all.yearmonth.dt.year >= 2015]
w_all = w_all.set_index('yearmonth').resample('M').ffill().reset_index()
# Employer-based analysis
# Calculate hiring by type/employer/month
sample_emp = \
    df_dummies.groupby(['Employer_ID', 'yearmonth'])['isAI', 'isOldTech', 'isDataMgmt'].sum().reset_index()
# Add an empty row for December 2018 for employers that are present in that year so to forward fill
# for the whole year
sample_emp['year'] = sample_emp.yearmonth.apply(lambda j: j.year)
sample_emp['month'] = sample_emp.yearmonth.apply(lambda j: j.month)
employers_to_fill = sample_emp[(sample_emp.year == 2018) & (sample_emp.month != 12)]
for emp in employers_to_fill.Employer_ID.unique():
    sample_emp = sample_emp.append(pd.DataFrame([[emp, dt.datetime(2018, 12, 31), 0, 0, 0, 2018, 12]],
                                                columns=sample_emp.columns))
# Forward fill observations per employer/yearmonth and set new hires to zero
sample_emp['yearmonth_original'] = sample_emp['yearmonth'] + MonthEnd(0)
sample_emp = \
    sample_emp.groupby('Employer_ID').apply(lambda j: j.drop_duplicates('yearmonth')
                                            .set_index('yearmonth').resample('M').ffill()
                                            .reset_index()).reset_index(drop=True)
condition = (sample_emp.yearmonth != sample_emp.yearmonth_original)
cols = ['isAI', 'isOldTech', 'isDataMgmt']
for col in cols:
    sample_emp.loc[condition, col] = 0
# Merge separations information
sample_emp = sample_emp.merge(
    BLS_all[['yearmonth', 's_AI', 's_OldTech', 's_DataMgmt', 'h_o_AI', 'h_o_OldTech',
             'h_o_DataMgmt', 'h_AI', 'h_OldTech', 'h_DataMgmt']],
    on=['yearmonth'])
# Sort new hires information by employer/year/month
sample_emp = sample_emp.sort_values(['Employer_ID', 'yearmonth']).reset_index(drop=True)
sample_emp['year'] = pd.DatetimeIndex(sample_emp.yearmonth).year
sample_emp['month'] = pd.DatetimeIndex(sample_emp.yearmonth).month
sample_emp['min_year'] = sample_emp.groupby('Employer_ID')['year'].transform('min')
sample_emp['min_month'] = sample_emp.groupby('Employer_ID')['month'].transform('min')
# Create equilibrium size value for employers present at the beginning of the sample (2010)
# Rules: - we consider that AI jobs started at the start of the sample
#        - for all other types of jobs:
#           - if the employer existed in 2010 we consider that it was on equilibrium size so we start
#             accumulating from the equilibrium value.
#           - if the employer appears later, we consider it as new and we start accumulating from zero
print('The distribution of the first year in which unique employers appear in the data is:')
print(sample_emp.drop_duplicates('Employer_ID').min_year.value_counts(normalize=True))
job_types = ['AI', 'OldTech', 'DataMgmt']
for job_t in job_types:
    sample_emp['first_' + job_t] = 0
    sample_emp.loc[
        sample_emp['is' + job_t].ne(0).groupby(sample_emp['Employer_ID']).idxmax(),
        'first_' + job_t] = 1
    sample_emp.loc[
        (sample_emp['first_' + job_t] == 1) & (sample_emp['is' + job_t] == 0), 'first_' + job_t] = 0
    sample_emp['L_' + job_t] = 0
    sample_emp['L_ss_' + job_t] = 0
    if job_t != 'AI':
        sample_emp.loc[(sample_emp.year <= 2010) & (sample_emp['first_' + job_t] == 1),
                       'L_ss_' + job_t] = (sample_emp['is' + job_t] * sample_emp['h_o_' + job_t]) / \
                                          sample_emp['s_' + job_t]
        sample_emp.loc[(sample_emp.year > 2010) & (sample_emp['first_' + job_t] == 1),
                       'L_ss_' + job_t] = sample_emp['is' + job_t]
        sample_emp.loc[(sample_emp['first_' + job_t] == 1), 'L_' + job_t] = \
            sample_emp['is' + job_t]
    else:
        sample_emp.loc[(sample_emp['first_' + job_t] == 1), 'L_ss_' + job_t] = \
            sample_emp['is' + job_t]
        sample_emp.loc[(sample_emp['first_' + job_t] == 1), 'L_' + job_t] = \
            sample_emp['is' + job_t]
# Accumulate hiring per employer to get total number of jobs per type
for job_t in job_types:
    print(job_t)
    sample_emp = sample_emp.groupby('Employer_ID').apply(Util.jobs_grp_cumulate, nn='L_ss', jt=job_t) \
        .reset_index(drop=True)
    sample_emp = sample_emp.groupby('Employer_ID').apply(Util.jobs_grp_cumulate, nn='L', jt=job_t) \
        .reset_index(drop=True)
# Show total accumulated jobs per month
jobs_total = sample_emp.groupby('yearmonth')['L_AI', 'L_OldTech', 'L_ss_OldTech', 'L_DataMgmt',
                                             'L_ss_DataMgmt'].sum().reset_index()
jobs_total.plot(x='yearmonth', y=['L_AI', 'L_OldTech', "L_DataMgmt"], color=[plots_colors['AI'],
                       plots_colors['OldTech'], plots_colors['DataMgmt']], figsize=(8, 5))
name = 'Accumulated AI, OldTech and DataMgmt jobs'
plt.legend(['L_AI', 'L_OT', 'L_DM'], fontsize=18)
plt.title(name, fontsize=18)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
plt.ylabel('Number of workers', fontsize=18)
plt.xlabel('Date', fontsize=18)
plt.savefig(os.path.join(plot_path, 'Figure_3b.jpg'), bbox_inches='tight')
plt.show()
jobs_total.plot(x='yearmonth', y=['L_AI', 'L_OldTech', 'L_ss_OldTech', 'L_DataMgmt', 'L_ss_DataMgmt'],
                color=[plots_colors['AI'],
                       plots_colors['OldTech'], plots_colors['OldTech'],
                       plots_colors['DataMgmt'], plots_colors['DataMgmt']],
                style=['-', '-', '.', '-', '.'], figsize=(8, 5))
plt.axvline(x=dt.datetime(2015, 1, 31), color='black', linestyle='-')
name = 'Accumulated AI, OldTech and DataMgmt jobs \n ' \
       '(with & without steady-state initialization)'
plt.legend(['L_AI', 'L_OT', 'L_ss_OT', 'L_DM', 'L_ss_DM'], ncol=2, fontsize=18)
plt.title(name, fontsize=18)
plt.xlabel('Date', fontsize=18)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
plt.savefig(os.path.join(plot_path, 'Accumulated_AI_OldTech_DataMgmt_jobs_all.jpg'),
            bbox_inches='tight')
plt.show()
# Compute average hiring per type/month across employers
cols = ['L_AI', 'L_OldTech', 'L_DataMgmt', 'L_ss_OldTech', 'L_ss_DataMgmt']
new_cols = []
for col in cols:
    col_name = col + '_av'
    sample_emp[col_name] = sample_emp.groupby(['yearmonth'])[col].transform('mean')
    new_cols.extend([col_name])
# Add date index (t) per employer, 1 is the first date for which there is any job of interest available
sample_emp = sample_emp.sort_values(['Employer_ID', 'yearmonth'])
sample_emp['t'] = 1
sample_emp['t'] = sample_emp.groupby('Employer_ID')['t'].transform('cumsum')
sample_emp['t'] = sample_emp['t'] - 1
# Compile all information into DataFrame for parameter estimation
cols = ['Employer_ID', 'yearmonth', 'year', 'month',
        'L_AI', 'L_ss_AI', 'L_OldTech', 'L_ss_OldTech', 'L_DataMgmt', 'L_ss_DataMgmt', 't'] + new_cols
cols1 = ['yearmonth', 'w_AI', 'w_OT', 'w_DM']
w_all['year'] = w_all.yearmonth.dt.year
df_est = pd.merge(sample_emp[cols], w_all[cols1], on='yearmonth')
df_est = df_est.merge(GS1M[['yearmonth', 'GS1M']], on='yearmonth', how='left')
df_est = df_est.sort_values(['Employer_ID', 'yearmonth'])
# Remove initial obs cause they don't have salary info and the accumulation is still converging
df_est = df_est[df_est.yearmonth > dt.datetime(2014, 12, 31)]
# Remove observations for which both AI and OldTech are 0 and for which data management is zero
condition = ((df_est.L_AI == 0) & (df_est.L_OldTech == 0))
df_est = df_est[~condition]
df_est = df_est[df_est.L_DataMgmt != 0]
descriptive_stats = df_est[['L_DataMgmt', 'L_AI', 'L_OldTech']].describe()
descriptive_stats.to_csv(os.path.join(plot_path, 'Table1.csv'))
print('Stock of labor descriptive statistics (Table 1)')
print(descriptive_stats)
print('The number of relevant observations used for the estimation is:', df_est.shape[0])
print('The number of relevant employers used for the estimation is:', df_est.Employer_ID.nunique())
# Save to file
df_est.to_pickle(os.path.join(out_path, "df_est.pkl"))
